import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, concat, countDistinct, to_timestamp, year, count
from pyspark.sql.types import StructType, ArrayType, StringType, StructField, IntegerType, BooleanType, FloatType, \
    DoubleType

# Point Spark's Hadoop shim at a local install unless the environment
# already provides one (only needed when running on Windows).
os.environ.setdefault('HADOOP_HOME', 'D:\\hadoop-2.9.2')

# Driver: build (or reuse) a local single-threaded SparkSession.
spark = (SparkSession.builder
         .master('local')
         .appName('HelloSpark')
         .getOrCreate())

# Explicit schema for the Beijing PM2.5 CSV: every column is an integer
# except the PM reading itself, which is a double.
_int_columns = ["id", "year", "month", "day", "hour", "season"]
schema = StructType(
    [StructField(name, IntegerType()) for name in _int_columns]
    + [StructField("pm", DoubleType())]
)

# Load the dataset with the explicit schema; with a schema supplied, the
# header row is skipped rather than used for type inference.
df = spark.read.schema(schema) \
    .option('header', True) \
    .csv('dataset/beijingpm_with_nan.csv')

df.printSchema()
df.show(truncate=False)

# 1. Dropping rows with missing values:
#    how='all' -> drop only when BOTH season and pm are missing
#    how='any' -> drop when EITHER season or pm is missing
df.na.drop(how='all', subset=['season', 'pm']).show()
df.na.drop(how='any', subset=['season', 'pm']).show()

# 2. Filling missing values
df.na.fill(0).show()                           # fill every numeric column with 0
df.na.fill(1, ['pm']).show()                   # fill only the pm column
df.na.fill({'season': 1, 'pm': 50}).show(500)  # per-column fill values

# Release driver resources; without this the JVM backing the session
# lingers until the Python process exits.
spark.stop()
